{
 "metadata": {
  "name": "",
  "signature": "sha256:63cc16054c19afe420fe419458902fbaf0be248a3a3d7ed28a7185154e78bac1"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn.feature_extraction.text import CountVectorizer"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 1
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from pandas import Series, DataFrame"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from weiboPredict import framework as fw"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 3
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Ask the project framework to load its data sets.\n",
      "# NOTE(review): presumably this is what populates fw.weibo_train_data, which later cells read - confirm in weiboPredict.framework.\n",
      "fw.loadData()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 5
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Read the pre-cleaned post text back in as a Series (first CSV column becomes the index).\n",
      "# NOTE(review): Series.from_csv is a legacy API (removed in pandas 1.0); this cell needs the old pandas the notebook was run with.\n",
      "train_context_clean = Series.from_csv('data/train_context_clean.csv')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 6
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Attach the cleaned text as a new column of the training frame.\n",
      "# Column assignment aligns the Series on index labels - assumes the CSV index matches fw.weibo_train_data's index; TODO confirm.\n",
      "fw.weibo_train_data['context_clean'] = train_context_clean"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 7
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Word-level bag-of-words vectoriser capped at a 100-term vocabulary.\n",
      "# tokenizer / preprocessor / stop_words stay None, so no custom callables or stop list are supplied.\n",
      "vectorizer = CountVectorizer(\n",
      "    analyzer=\"word\",\n",
      "    tokenizer=None,\n",
      "    preprocessor=None,\n",
      "    stop_words=None,\n",
      "    max_features=100,\n",
      ")"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 8
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Offline training window: rows with '2014-11-01' <= time < '2014-12-01'.\n",
      "in_november = (\n",
      "    (fw.weibo_train_data['time'] >= '2014-11-01')\n",
      "    & (fw.weibo_train_data['time'] < '2014-12-01')\n",
      ")\n",
      "# Learn the vocabulary on that window and keep its term-count matrix.\n",
      "off_train_data_features = vectorizer.fit_transform(fw.weibo_train_data[in_november].context_clean)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Peek at the vocabulary term stored at index 99.\n",
      "term_at_99 = vectorizer.get_feature_names()[99]\n",
      "print(term_at_99)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "u9ad8\n"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Show the character at code point U+9AD8.\n",
      "# NOTE(review): the 'u9ad8' term printed by the previous cell looks like an escape sequence that lost its backslash during tokenisation, i.e. context_clean may still hold escaped text - confirm.\n",
      "print(u'\\u9ad8')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\u9ad8\n"
       ]
      }
     ],
     "prompt_number": 13
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Densify the term-count matrix produced by the vectoriser.\n",
      "# Guarded so that re-running the cell is a no-op: once converted, the ndarray has no .toarray() and the bare call would raise AttributeError.\n",
      "if hasattr(off_train_data_features, 'toarray'):\n",
      "    off_train_data_features = off_train_data_features.toarray()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 14
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Sanity check: (rows in the training window, vocabulary size).\n",
      "off_train_data_features.shape"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 15,
       "text": [
        "(268228, 100)"
       ]
      }
     ],
     "prompt_number": 15
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from sklearn import linear_model"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 16
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import numpy as np"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 22
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Ordinary least-squares estimator with default settings; fitted on the November window below.\n",
      "LR_off_train_forward = linear_model.LinearRegression()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 17
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Targets are taken from the same '2014-11-01' <= time < '2014-12-01' window that\n",
      "# off_train_data_features was built from, so rows pair up one-to-one.\n",
      "in_november = (\n",
      "    (fw.weibo_train_data['time'] >= '2014-11-01')\n",
      "    & (fw.weibo_train_data['time'] < '2014-12-01')\n",
      ")\n",
      "# fit() stays the last expression so the fitted estimator is echoed as the cell output.\n",
      "LR_off_train_forward.fit(off_train_data_features,\n",
      "                         fw.weibo_train_data[in_november].forward_count)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 18,
       "text": [
        "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
       ]
      }
     ],
     "prompt_number": 18
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Forward counts for the November training window, kept under their own name for the metrics below.\n",
      "in_november = (\n",
      "    (fw.weibo_train_data['time'] >= '2014-11-01')\n",
      "    & (fw.weibo_train_data['time'] < '2014-12-01')\n",
      ")\n",
      "off_train_data_forward = fw.weibo_train_data[in_november]['forward_count']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Coefficients of the November-trained model, one per vocabulary term.\n",
      "print('Coefficients: \\n%s' % (LR_off_train_forward.coef_,))\n",
      "# Mean squared error on the training window. np.mean of the squared residuals is an\n",
      "# average, not a residual *sum* of squares, so the label states what is actually computed.\n",
      "train_residuals = LR_off_train_forward.predict(off_train_data_features) - off_train_data_forward\n",
      "print('Mean squared error: %.2f' % np.mean(train_residuals ** 2))\n",
      "# LinearRegression.score returns R^2 (coefficient of determination); 1 is a perfect fit.\n",
      "print('Variance score: %.2f' % LR_off_train_forward.score(off_train_data_features, off_train_data_forward))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Coefficients: \n",
        "[  1.28186683e+00   2.52042790e+00  -4.22240511e-01   3.00193074e-01\n",
        "   3.28058189e-01  -2.13529577e-01   3.27640197e-01  -3.49218136e-03\n",
        "  -7.51687673e-01   4.60758983e-01  -1.88868901e-01  -5.63483055e-01\n",
        "  -1.83995649e-01   2.93752622e-01  -1.97645411e-01  -1.79253901e-01\n",
        "  -2.06172987e-03   1.99173963e+00  -6.27022384e-01   6.15917705e-02\n",
        "   2.85340672e-01  -6.74744046e-01   3.22703508e-01  -1.93906428e-01\n",
        "  -7.66002106e-02  -3.09497938e-01  -3.45029518e-01   1.27872315e-01\n",
        "   3.79986469e-01   2.80729951e-01   8.66619194e-01   1.06845509e+00\n",
        "  -7.46800101e-01  -2.54218115e-01   2.73414376e+00  -1.47288430e-01\n",
        "  -1.01462568e+00   4.30055528e-01  -2.93930772e-01   3.81736449e-01\n",
        "  -3.66519904e-01   7.19002162e-01  -1.73325464e-01   3.24875449e-01\n",
        "  -4.16267259e-01   1.09335789e+00  -5.77318469e-01  -5.27257101e-02\n",
        "  -9.70816462e-01  -6.26686594e-01  -2.76311183e-01   9.30172159e-01\n",
        "   9.38379061e-01   1.63664274e-01   4.09739757e-01  -7.65133900e-01\n",
        "   3.31016884e-01  -2.58172161e-01   1.01198797e+00   1.77307286e+00\n",
        "   3.61991774e-02  -3.89654803e-01  -1.99074142e-01  -6.28604106e-01\n",
        "   9.55516554e-01  -7.35720041e-02  -4.58588853e-01  -2.23462541e-01\n",
        "   2.15590694e-01   8.00243970e-01   1.16599814e-01   2.19498519e+00\n",
        "  -2.66742669e-01  -4.14278221e-01  -1.95668630e-01  -9.36758394e-01\n",
        "  -4.17932271e-01  -3.16992248e-01   4.58014954e-01  -5.32611220e-01\n",
        "  -1.73775996e-01   7.05968914e-01   1.49674861e+00  -5.08237071e-01\n",
        "  -2.97077601e-01  -4.55922806e-01   2.16940611e-01   5.40526987e-02\n",
        "  -4.94225407e-01   1.04665258e+00   2.38683146e+00   2.94031871e+00\n",
        "  -9.84394767e-01   2.98732515e-01  -7.10995426e-02  -4.39575815e-01\n",
        "  -8.15566261e-01   2.66261832e-01  -4.48605915e-01  -2.35131478e-01]\n",
        "Residual sum of squares: 4199.03"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Variance score: 0.00"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Offline hold-out window: rows with '2014-12-01' <= time <= '2014-12-31'.\n",
      "# NOTE(review): if `time` carries a time-of-day, `<= '2014-12-31'` drops most of 31 Dec; the November filter uses a half-open `<` bound instead - confirm the column format.\n",
      "in_december = (\n",
      "    (fw.weibo_train_data['time'] >= '2014-12-01')\n",
      "    & (fw.weibo_train_data['time'] <= '2014-12-31')\n",
      ")\n",
      "# transform (not fit_transform): reuse the vocabulary already learned on the training window.\n",
      "off_test_data_features = vectorizer.transform(fw.weibo_train_data[in_december].context_clean)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 20
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Densify the hold-out term-count matrix.\n",
      "# Guarded so that re-running the cell is a no-op: once converted, the ndarray has no .toarray() and the bare call would raise AttributeError.\n",
      "if hasattr(off_test_data_features, 'toarray'):\n",
      "    off_test_data_features = off_test_data_features.toarray()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 21
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Forward counts for the December hold-out window ('2014-12-01' <= time <= '2014-12-31').\n",
      "# NOTE(review): if `time` carries a time-of-day, the `<= '2014-12-31'` bound drops most of 31 Dec - confirm the column format.\n",
      "in_december = (\n",
      "    (fw.weibo_train_data['time'] >= '2014-12-01')\n",
      "    & (fw.weibo_train_data['time'] <= '2014-12-31')\n",
      ")\n",
      "off_test_data_forward = fw.weibo_train_data[in_december]['forward_count']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 30
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# A second, separate least-squares estimator; it is fitted on the December window in the next cell.\n",
      "LR_off_test_forward = linear_model.LinearRegression()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Fit the second estimator directly on the December features and targets.\n",
      "# fit() stays the last expression so the fitted estimator is echoed as the cell output.\n",
      "LR_off_test_forward.fit(X=off_test_data_features, y=off_test_data_forward)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 32,
       "text": [
        "LinearRegression(copy_X=True, fit_intercept=True, normalize=False)"
       ]
      }
     ],
     "prompt_number": 32
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Coefficients of the model fitted on December itself (LR_off_test_forward),\n",
      "# presumably shown for comparison with the November-trained coefficients.\n",
      "print('Coefficients: \\n%s' % (LR_off_test_forward.coef_,))\n",
      "# The two metrics below are for the November-trained model (LR_off_train_forward) scored on\n",
      "# the December hold-out, not for the model whose coefficients were just printed.\n",
      "# np.mean of the squared residuals is a mean squared error, not a residual *sum* of squares.\n",
      "test_residuals = LR_off_train_forward.predict(off_test_data_features) - off_test_data_forward\n",
      "print('Mean squared error: %.2f' % np.mean(test_residuals ** 2))\n",
      "# LinearRegression.score returns R^2 (coefficient of determination); 1 is a perfect fit.\n",
      "print('Variance score: %.2f' % LR_off_train_forward.score(off_test_data_features, off_test_data_forward))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Coefficients: \n",
        "[-0.1171739   0.63265743 -0.57442636  1.08746757  0.20182595  0.33553953\n",
        "  0.81983983 -0.0211989  -0.66764881  0.12984936 -0.24389544  0.36226616\n",
        "  0.06604472  0.20332698  0.50035616  0.28453848  0.26696837  0.33407466\n",
        " -0.51198033 -0.79661462  0.39749733 -0.55538544  1.22117418 -0.22280062\n",
        "  0.27924758  0.06368221 -0.07490052 -0.27374777 -0.32021338 -0.67411311\n",
        "  0.50933275  0.23655788 -0.45525629  0.0289753   2.67015218 -0.24295205\n",
        " -0.72936656  0.14092331  0.13797876  0.08975027 -0.09392222  0.5397419\n",
        " -0.28049526  0.09968362 -0.30238392  1.40514351 -0.19270185  0.05052858\n",
        " -0.82315137 -0.25508024 -0.44458304  0.84594892  0.21754754 -0.05292221\n",
        " -0.67396415 -0.30519182  0.85181616  0.55422181 -0.4006845   1.06301328\n",
        " -0.05069149 -0.3123591  -0.09636032 -0.22507588  0.747482   -0.24339028\n",
        " -0.63067304 -0.05132197  0.0145011  -0.04456321  0.16510655  1.78731256\n",
        "  0.45439284 -0.14257295 -0.45809281 -0.37898258 -0.02587705 -0.21514858\n",
        "  0.52580601 -0.38464043  0.23830032 -0.31960683  2.22860638 -0.53478454\n",
        " -0.22331493 -0.30997626  0.08812845 -0.2962532   0.06009859 -0.25908015\n",
        "  1.85063471  3.14683021 -0.51927151  0.13069531 -0.44594174  0.06274401\n",
        " -0.04162402 -0.61135463 -0.6250505  -0.10684694]\n",
        "Residual sum of squares: 2902.73"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n",
        "Variance score: 0.00"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "\n"
       ]
      }
     ],
     "prompt_number": 33
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Predict December forward counts with the November-trained model.\n",
      "off_predict_forward = LR_off_train_forward.predict(off_test_data_features)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 35
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Actual forward counts at positions 20..39 of the hold-out window (positional slice; original index labels are kept).\n",
      "off_test_data_forward.iloc[20:40]"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 55,
       "text": [
        "73     0\n",
        "81     0\n",
        "82     0\n",
        "119    0\n",
        "120    0\n",
        "121    0\n",
        "122    0\n",
        "123    0\n",
        "124    0\n",
        "183    0\n",
        "300    0\n",
        "301    0\n",
        "302    2\n",
        "303    1\n",
        "304    0\n",
        "305    0\n",
        "306    0\n",
        "307    0\n",
        "308    0\n",
        "309    2\n",
        "Name: forward_count, dtype: int64"
       ]
      }
     ],
     "prompt_number": 55
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Rounded predictions at the same positions 20..39, for eyeballing against the actual counts.\n",
      "predicted_forward = Series(off_predict_forward)\n",
      "predicted_forward.iloc[20:40].round()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 58,
       "text": [
        "20    1\n",
        "21    0\n",
        "22    1\n",
        "23    0\n",
        "24    2\n",
        "25    3\n",
        "26   -0\n",
        "27   -0\n",
        "28   -0\n",
        "29    2\n",
        "30    4\n",
        "31    2\n",
        "32    6\n",
        "33   -6\n",
        "34   -3\n",
        "35    5\n",
        "36   -1\n",
        "37    4\n",
        "38    3\n",
        "39    2\n",
        "dtype: float64"
       ]
      }
     ],
     "prompt_number": 58
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Number of non-null target values in the December hold-out window.\n",
      "off_test_data_forward.count()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 49,
       "text": [
        "272702"
       ]
      }
     ],
     "prompt_number": 49
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}